# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import pandas_bokeh
import plotly.express as px
pandas_bokeh.output_notebook()
%matplotlib inline
# Load the raw outbreak records from disk.
df = pd.read_csv('outbreak_all_data_new.csv')

# Parse the reporting date once, then derive a monthly period label from it
# (stored as a pandas 'string' column) for the per-month groupings.
reporting_dates = pd.to_datetime(df['reporting_date'])
df['reporting_date'] = reporting_dates
df['month_year'] = reporting_dates.dt.to_period('M').astype('string')

# Replace every missing value with 0, in place. This applies to all columns,
# not only the numeric count columns.
df.fillna(0, inplace=True)
# Build a data frame of per-disease aggregates (one row per disease).
# numeric_only=True restricts the sum to numeric columns: since pandas 2.0 the
# default no longer drops non-numeric columns silently, so the datetime and
# text columns would otherwise make the sum fail or be concatenated.
diseases_sum = df.groupby('disease').sum(numeric_only=True).reset_index()
# Case fatality = deaths / cases, for humans and for animals.
# A zero denominator does not raise: it yields NaN (0/0) or inf (x/0).
diseases_sum['human_case_fatality'] = (
    diseases_sum['humans_deaths'] / diseases_sum['humans_affected']
)
diseases_sum['animal_case_fatality'] = (
    diseases_sum['sum_deaths'] / diseases_sum['sum_cases']
)
# Build a data frame of per-country aggregates (one row per country).
# numeric_only=True restricts the sum to numeric columns: since pandas 2.0 the
# default no longer drops non-numeric columns silently, so the datetime and
# text columns would otherwise make the sum fail or be concatenated.
country_sum = df.groupby('country').sum(numeric_only=True).reset_index()
# Case fatality = deaths / cases, for humans and for animals.
# A zero denominator does not raise: it yields NaN (0/0) or inf (x/0).
country_sum['human_case_fatality'] = (
    country_sum['humans_deaths'] / country_sum['humans_affected']
)
country_sum['animal_case_fatality'] = (
    country_sum['sum_deaths'] / country_sum['sum_cases']
)
# Report the size of the loaded data set (df.shape is (rows, columns)).
print(f"this data set has {df.shape[1]} columns and {df.shape[0]} rows")
this data set has 25 columns and 90000 rows
The dataset records disease incidence together with the resulting human and animal infections and deaths.
The data include location information and could help map out future outbreaks.
# Histogram of humans affected per record: bins of width 10 from 0 to 500,
# with a logarithmic count axis capped at the number of rows.
df['humans_affected'].plot_bokeh(
    kind='hist',
    bins=range(0, 501, 10),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
let's dig deeper
# Zoomed view of humans affected: unit-width bins restricted to the 0-20 range,
# with a logarithmic count axis capped at the number of rows.
df['humans_affected'].plot_bokeh(
    kind='hist',
    bins=range(0, 21),
    xlim=(0, 20),
    vertical_xlabel=True,
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
# Summarise humans_affected: most frequent value, maximum and null count.
# Series.mode() returns a Series (several values when modes tie), so take the
# first entry explicitly; int() on the Series itself fails on ties and is
# deprecated for single-element Series in recent pandas.
print(
    f"the mode of humans_affected is {int(df['humans_affected'].mode().iloc[0])}"
    f"\nmax humans_affected is {int(df['humans_affected'].max())}"
    f"\nhumans_affected column has : {df['humans_affected'].isnull().sum()} nulls"
)
the mode of humans_affected is 0 max humans_affected is 495 humans_affected column has : 0 nulls
When humans are affected at all, the most common value is 1, which is a good thing. Most of this column was null, and I assumed null means zero, so I filled it with zero. There are also outliers with around 500 humans affected, which raises a concern: we need to see which diseases cause them.
# Histogram of human age: unit-width bins from 0 to 99,
# with a logarithmic count axis capped at the number of rows.
df['humans_age'].plot_bokeh(
    kind='hist',
    bins=range(0, 100),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
# Summarise humans_age: most frequent value, maximum and null count.
# Series.mode() returns a Series (several values when modes tie), so take the
# first entry explicitly; int() on the Series itself fails on ties and is
# deprecated for single-element Series in recent pandas.
print(
    f"the mode of humans_age is {int(df['humans_age'].mode().iloc[0])}"
    f"\nmax humans_age is {int(df['humans_age'].max())}"
    f"\nhumans_age column has : {df['humans_age'].isnull().sum()} nulls"
)
the mode of humans_age is 0 max humans_age is 96 humans_age column has : 0 nulls
Most human cases appear to be younger than 1 year, or rather their age is zero because most of the age values were null and were filled with zero.
# Histogram of human deaths per record: bins of width 2 from 0 to 136,
# with a logarithmic count axis capped at the number of rows.
df['humans_deaths'].plot_bokeh(
    kind='hist',
    bins=range(0, 137, 2),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
# Summarise humans_deaths: most frequent value, maximum and null count.
# Series.mode() returns a Series (several values when modes tie), so take the
# first entry explicitly; int() on the Series itself fails on ties and is
# deprecated for single-element Series in recent pandas.
print(
    f"the mode of humans_deaths is {int(df['humans_deaths'].mode().iloc[0])}"
    f"\nmax humans_deaths is {int(df['humans_deaths'].max())}"
    f"\nhumans_deaths column has : {df['humans_deaths'].isnull().sum()} nulls"
)
the mode of humans_deaths is 0 max humans_deaths is 134 humans_deaths column has : 0 nulls
When deaths are recorded at all, the most common value of humans_deaths is 1. Sometimes there is a high number of deaths, which raises a concern: we need to see which diseases cause them. Most of the column was null, which I assumed means zero, so I filled it with zero.
# Histogram of animal cases per record: bins of width 10,000 from 0 to 900,000,
# with a logarithmic count axis capped at the number of rows.
df['sum_cases'].plot_bokeh(
    kind='hist',
    bins=range(0, 900001, 10000),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
let's dig deeper
# Zoomed view of animal cases: bins of width 10 over the 0-1000 range,
# with a logarithmic count axis capped at the number of rows.
# range() excludes its stop value, so the stop is 1001 to get a final bin edge
# at 1000; with a stop of 1000 the last bin (990-1000) was missing from the view.
df['sum_cases'].plot_bokeh(
    kind='hist',
    bins=range(0, 1001, 10),
    xlim=(0, 1000),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
let's dig deeper
# Closest view of animal cases: unit-width bins over the 0-100 range,
# with a logarithmic count axis capped at the number of rows.
df['sum_cases'].plot_bokeh(
    kind='hist',
    bins=range(0, 101),
    xlim=(0, 100),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
# Summarise sum_cases: most frequent value, maximum and null count.
# Series.mode() returns a Series (several values when modes tie), so take the
# first entry explicitly; int() on the Series itself fails on ties and is
# deprecated for single-element Series in recent pandas.
print(
    f"the mode of sum_cases of animals is {int(df['sum_cases'].mode().iloc[0])}"
    f"\nmax sum_cases of animals is {int(df['sum_cases'].max())}"
    f"\nsum_cases column has : {df['sum_cases'].isnull().sum()} nulls"
)
the mode of sum_cases of animals is 1 max sum_cases of animals is 800000 sum_cases column has : 0 nulls
Most values of sum_cases for animals are below 10 and the mode is 1, which is a good thing. However, there are records where it is very large, up to 800K; I think those records are in poultry, and we need to see which diseases cause them.
# Histogram of animals destroyed per record: bins of width 50,000 from 0 to
# 3,700,000, with a logarithmic count axis capped at the number of rows.
df['sum_destroyed'].plot_bokeh(
    kind='hist',
    bins=range(0, 3750000, 50000),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
let's dig deeper
# Zoomed view of animals destroyed: bins of width 500 over the 0-50,000 range,
# with a logarithmic count axis capped at the number of rows.
# range() excludes its stop value, so the stop is 50001 to get a final bin edge
# at 50,000; with a stop of 50000 the last bin (49,500-50,000) was missing.
df['sum_destroyed'].plot_bokeh(
    kind='hist',
    bins=range(0, 50001, 500),
    xlim=(0, 50000),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
let's dig deeper
# Closest view of animals destroyed: bins of width 5 over the 0-500 range,
# with a logarithmic count axis capped at the number of rows.
# range() excludes its stop value, so the stop is 501 to get a final bin edge
# at 500; with a stop of 500 the last bin (495-500) was missing from the view.
df['sum_destroyed'].plot_bokeh(
    kind='hist',
    bins=range(0, 501, 5),
    xlim=(0, 500),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
# Summarise sum_destroyed: most frequent value, maximum and null count.
# Series.mode() returns a Series (several values when modes tie), so take the
# first entry explicitly; int() on the Series itself fails on ties and is
# deprecated for single-element Series in recent pandas.
print(
    f"the mode of sum_destroyed of animals is {int(df['sum_destroyed'].mode().iloc[0])}"
    f"\nmax sum_destroyed of animals is {int(df['sum_destroyed'].max())}"
    f"\nsum_destroyed column has : {df['sum_destroyed'].isnull().sum()} nulls"
)
the mode of sum_destroyed of animals is 0 max sum_destroyed of animals is 3660000 sum_destroyed column has : 0 nulls
sum_destroyed sometimes reaches really high numbers, up to 3.66 million, which raises a concern. I think those records are in poultry, and we need to see which diseases cause them. I also assumed the nulls mean zero, so I filled them with zero.
# Histogram of animals slaughtered per record: bins of width 10,000 from 0 to
# 890,000, with a logarithmic count axis capped at the number of rows.
df['sum_slaughtered'].plot_bokeh(
    kind='hist',
    bins=range(0, 900000, 10000),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
let's dig deeper
# Zoomed view of animals slaughtered: bins of width 100 over the 0-10,000 range,
# with a logarithmic count axis capped at the number of rows.
# range() excludes its stop value, so the stop is 10001 to get a final bin edge
# at 10,000; with a stop of 10000 the last bin (9,900-10,000) was missing.
df['sum_slaughtered'].plot_bokeh(
    kind='hist',
    bins=range(0, 10001, 100),
    xlim=(0, 10000),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
let's dig deeper
# Closest view of animals slaughtered: unit-width bins over the 0-100 range,
# with a logarithmic count axis capped at the number of rows.
# range() excludes its stop value, so the stop is 101 to get a final bin edge
# at 100; with a stop of 100 the last bin (99-100) was missing from the view.
df['sum_slaughtered'].plot_bokeh(
    kind='hist',
    bins=range(0, 101, 1),
    xlim=(0, 100),
    vertical_xlabel=True,
    figsize=(1000, 500),
    logy=True,
    ylim=(0.1, df.shape[0]),
    zooming=False,
)
# Summarise sum_slaughtered: most frequent value, maximum and null count.
# Series.mode() returns a Series (several values when modes tie), so take the
# first entry explicitly; int() on the Series itself fails on ties and is
# deprecated for single-element Series in recent pandas.
print(
    f"the mode of sum_slaughtered of animals is {int(df['sum_slaughtered'].mode().iloc[0])}"
    f"\nmax sum_slaughtered of animals is {int(df['sum_slaughtered'].max())}"
    f"\nsum_slaughtered column has : {df['sum_slaughtered'].isnull().sum()} nulls"
)
the mode of sum_slaughtered of animals is 0 max sum_slaughtered of animals is 850816 sum_slaughtered column has : 0 nulls
Most of the column was null, and I assumed null means zero, which is also the mode. However, there are high outliers approaching one million, which need more investigation.
# Number of records per disease, sorted ascending, as a horizontal bar chart.
df['disease'].value_counts().sort_values().plot_bokeh(
    kind='barh',
    figsize=(950, 500),
    legend='bottom_right',
)
The highest count is for Avian influenza, followed by African swine fever; viral diseases have the highest incidence.
# Number of records per country, sorted ascending, as a tall horizontal bar chart.
df['country'].value_counts().sort_values().plot_bokeh(
    kind='barh',
    figsize=(950, 1750),
    legend='bottom_right',
    zooming=False,
)
The countries with the most records are, in order: Poland, China (which is expected), Romania, France, Indonesia, Greece, and Egypt.
# Number of records per region, sorted ascending, as a horizontal bar chart.
df['region'].value_counts().sort_values().plot_bokeh(
    kind='barh',
    legend='bottom_right',
    zooming=False,
)
Europe has the most records, followed by Asia.
There were high outliers that need more investigation. I did not make any changes to the data other than renaming the columns.
# Total humans affected per disease, sorted ascending.
df.groupby('disease')['humans_affected'].sum().sort_values().plot_bokeh(
    kind='barh',
    zooming=False,
    legend='bottom_right',
    figsize=(950, 500),
)

# Total human deaths per disease, sorted ascending.
df.groupby('disease')['humans_deaths'].sum().sort_values().plot_bokeh(
    kind='barh',
    zooming=False,
    legend='bottom_right',
    figsize=(950, 500),
)

# Humans affected per month as a line chart.
df.groupby('month_year')['humans_affected'].sum().plot_bokeh(
    kind='line',
    figsize=(2000, 500),
    vertical_xlabel=True,
    zooming=False,
)

# Human case fatality per disease, highest first. NaN ratios are turned into 0
# and then dropped together with the genuine zeros.
px.bar(
    diseases_sum.fillna(0)
    .query('human_case_fatality != 0')
    .sort_values('human_case_fatality', ascending=False),
    x='disease',
    y='human_case_fatality',
)

# Total animal cases per disease, sorted ascending.
df.groupby('disease')['sum_cases'].sum().sort_values().plot_bokeh(
    kind='barh',
    zooming=False,
    legend='bottom_right',
    figsize=(950, 500),
    disable_scientific_axes='x',
)
let's dig without poultry diseases and African swine fever
# Animal cases per disease with the three dominant diseases filtered out,
# so the remaining diseases become readable.
(
    df.query('disease !="Influenza - Avian" and disease !="Newcastle disease" and disease !="African swine fever"')
    .groupby('disease')['sum_cases']
    .sum()
    .sort_values()
    .plot_bokeh(
        kind='barh',
        zooming=False,
        legend='bottom_right',
        figsize=(950, 500),
        disable_scientific_axes='x',
    )
)

# Total animal deaths per disease (all diseases), sorted ascending.
df.groupby('disease')['sum_deaths'].sum().sort_values().plot_bokeh(
    kind='barh',
    zooming=False,
    legend='bottom_right',
    figsize=(950, 500),
    disable_scientific_axes='x',
)
let's dig without poultry diseases and African swine fever
# Animal deaths per disease with the three dominant diseases filtered out.
(
    df.query('disease !="Influenza - Avian" and disease !="Newcastle disease" and disease !="African swine fever"')
    .groupby('disease')['sum_deaths']
    .sum()
    .sort_values()
    .plot_bokeh(
        kind='barh',
        zooming=False,
        legend='bottom_right',
        figsize=(950, 500),
        disable_scientific_axes='x',
    )
)

# Total animals slaughtered per disease (all diseases), sorted ascending.
df.groupby('disease')['sum_slaughtered'].sum().sort_values().plot_bokeh(
    kind='barh',
    zooming=False,
    legend='bottom_right',
    figsize=(950, 500),
    disable_scientific_axes='x',
)
let's dig without poultry diseases and African swine fever
# Animals slaughtered per disease with the three dominant diseases filtered out.
(
    df.query('disease !="Influenza - Avian" and disease !="Newcastle disease" and disease !="African swine fever"')
    .groupby('disease')['sum_slaughtered']
    .sum()
    .sort_values()
    .plot_bokeh(
        kind='barh',
        zooming=False,
        legend='bottom_right',
        figsize=(950, 500),
        disable_scientific_axes='x',
    )
)

# Total animals destroyed per disease (all diseases), sorted ascending.
df.groupby('disease')['sum_destroyed'].sum().sort_values().plot_bokeh(
    kind='barh',
    zooming=False,
    legend='bottom_right',
    figsize=(950, 500),
    disable_scientific_axes='x',
)
let's dig without poultry diseases and African swine fever
# Animals destroyed per disease with the three dominant diseases filtered out.
(
    df.query('disease !="Influenza - Avian" and disease !="Newcastle disease" and disease !="African swine fever"')
    .groupby('disease')['sum_destroyed']
    .sum()
    .sort_values()
    .plot_bokeh(
        kind='barh',
        zooming=False,
        legend='bottom_right',
        figsize=(950, 500),
        disable_scientific_axes='x',
    )
)
Poultry diseases dominate the numbers, followed by African swine fever and other diseases of pigs and ruminants.
# Animal cases per month as a line chart.
df.groupby('month_year')['sum_cases'].sum().plot_bokeh(
    kind='line',
    figsize=(2000, 500),
    vertical_xlabel=True,
    zooming=False,
)
There are two spikes, in 2006 and 2012.
# Animal case fatality per disease, sorted ascending. NaN ratios are turned
# into 0 and then dropped together with the genuine zeros.
px.bar(
    diseases_sum.fillna(0)
    .query('animal_case_fatality != 0')
    .sort_values('animal_case_fatality'),
    x='animal_case_fatality',
    y='disease',
    width=950,
    height=650,
)
the highest case fatality is for Schmallenberg followed by Anthrax
# Total humans affected per country, sorted ascending.
df.groupby('country')['humans_affected'].sum().sort_values().plot_bokeh(
    kind='barh',
    zooming=False,
    legend='bottom_right',
    figsize=(950, 1750),
)

# Total human deaths per country, sorted ascending.
df.groupby('country')['humans_deaths'].sum().sort_values().plot_bokeh(
    kind='barh',
    zooming=False,
    legend='bottom_right',
    figsize=(950, 1750),
)

# Human case fatality per country, sorted ascending. NaN ratios are turned
# into 0 and then dropped together with the genuine zeros.
px.bar(
    country_sum.fillna(0)
    .query('human_case_fatality !=0')
    .sort_values('human_case_fatality'),
    x='human_case_fatality',
    y='country',
    height=1000,
)
No
# Monthly animal cases, one line per disease.
# numeric_only=True restricts the sum to numeric columns: since pandas 2.0 the
# default no longer drops non-numeric columns silently, so the datetime and
# text columns would otherwise make the sum fail or be concatenated.
px.line(
    df.groupby(['month_year', 'disease']).sum(numeric_only=True).reset_index(),
    x='month_year',
    y='sum_cases',
    color='disease',
)
as we can see the spikes are caused by poultry diseases mostly
# Animal cases next to animal deaths for every disease, on a log-scaled y axis.
px.bar(
    diseases_sum,
    x='disease',
    y=[diseases_sum['sum_cases'], diseases_sum['sum_deaths']],
    barmode='group',
    log_y=True,
)
Some diseases, such as Avian Influenza and Newcastle disease, have very high numbers of cases and deaths, while most diseases are low on both. MERS-CoV, however, has zero animal deaths despite its high fatality in humans.
# Keep only the diseases that affected at least one human, so the chart is
# not cluttered with empty entries.
dise_sum_human = diseases_sum.query('humans_affected != 0')
# Humans affected next to human deaths per disease, on a log-scaled y axis.
px.bar(
    dise_sum_human,
    x='disease',
    y=[dise_sum_human['humans_affected'], dise_sum_human['humans_deaths']],
    barmode='group',
    log_y=True,
)
# Monthly humans affected, one line per disease, skipping month/disease pairs
# with no affected humans.
# numeric_only=True restricts the sum to numeric columns: since pandas 2.0 the
# default no longer drops non-numeric columns silently, so the datetime and
# text columns would otherwise make the sum fail or be concatenated.
px.line(
    df.groupby(['month_year', 'disease'])
    .sum(numeric_only=True)
    .query('humans_affected != 0')
    .reset_index(),
    x='month_year',
    y='humans_affected',
    color='disease',
)
# Keep only the countries that reported at least one affected human, so the
# chart is not cluttered with empty entries.
country_sum_human = country_sum.query('humans_affected != 0')
# Humans affected next to human deaths per country, on a log-scaled x axis.
px.bar(
    country_sum_human,
    y='country',
    x=[country_sum_human['humans_affected'], country_sum_human['humans_deaths']],
    barmode='group',
    height=1750,
    log_x=True,
)